#title: "Data Preparation & Summary Stats"
#author: "Qianhui Li"
setwd("/Users/qianhuili/Desktop/GitHub/AAE724/Script/Data_cleaning")
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(leaps)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: foreach
## Loaded glmnet 2.0-18
library(ggplot2)
library(gmodels)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(corrplot)
## corrplot 0.84 loaded
library(ISLR)
library(tree)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(rpart)
library(rpart.plot)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
##
## ci
## The following object is masked from 'package:glmnet':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(corrplot)
library(lfe)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.tree tree
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ tibble 2.1.3 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate() masks foreach::accumulate()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ Matrix::expand() masks tidyr::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ Matrix::pack() masks tidyr::pack()
## ✖ car::recode() masks dplyr::recode()
## ✖ MASS::select() masks dplyr::select()
## ✖ purrr::some() masks car::some()
## ✖ Matrix::unpack() masks tidyr::unpack()
## ✖ purrr::when() masks foreach::when()
library(viridis)
## Loading required package: viridisLite
library(RColorBrewer)
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(wesanderson)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(corrplot)
library(ROSE)
## Loaded ROSE 0.0-3
library(naniar)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(blorr)
library(pROC)
#=============================================
##Data Preparation
bankoriginal<-read.csv("bank_data.csv",header=TRUE, sep=";", na.strings=c("unknown","non-existent"))
#Check # & % of missing values
gg_miss_var(bankoriginal)

gg_miss_var(bankoriginal, show_pct = TRUE)

#Since there is "999" in pdays means client was not previously contacted, I convert pdays into a dummy variable, never contacted(999)=0,others=1.
bankoriginal$pdays <-as.factor(bankoriginal$pdays)
bankoriginal$pdays <-ifelse(bankoriginal$pdays==999,0,1)
#The first variable that has the largest proportion of missing values is "default",
#However, it may be possible that customer is not willing to disclose this information to the banking representative.
#Hence the unknown value in 'default' is actually a separate value.
#Thus I kept the variable "default", and I think it also make sense for "loan" and "housing" loan variable
bankoriginal$default <- as.character(bankoriginal$default)
bankoriginal$default[is.na(bankoriginal$default)] <- "refuse2disclose"
bankoriginal$loan<-as.character(bankoriginal$loan)
bankoriginal$loan[is.na(bankoriginal$loan)] <- "refuse2disclose"
bankoriginal$housing<-as.character(bankoriginal$housing)
bankoriginal$housing[is.na(bankoriginal$housing)] <- "refuse2disclose"
#As indicated by the data contributor, the duration is not known before a call is performed.
#Also, after the end of the call y is obviously known.
#Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
#Thus I removed "duration"
bankoriginal = bankoriginal %>%
select(-duration)
#check for missing value graph again
gg_miss_var(bankoriginal)

gg_miss_var(bankoriginal, show_pct = TRUE)

#omit missing values
bank<-na.omit(bankoriginal)
sum(is.na(bank))
## [1] 0
#Data summary
summary(bank)
## age job marital
## Min. :17.00 admin. :10159 divorced: 4417
## 1st Qu.:32.00 blue-collar: 8788 married :23748
## Median :38.00 technician : 6520 single :11026
## Mean :39.86 services : 3814
## 3rd Qu.:47.00 management : 2798
## Max. :98.00 retired : 1617
## (Other) : 5495
## education default housing
## basic.4y : 4118 Length:39191 Length:39191
## basic.6y : 2264 Class :character Class :character
## basic.9y : 6006 Mode :character Mode :character
## high.school : 9464
## illiterate : 18
## professional.course: 5225
## university.degree :12096
## loan contact month day_of_week
## Length:39191 cellular :24983 may :13128 fri:7417
## Class :character telephone:14208 jul : 6767 mon:8107
## Mode :character aug : 5947 thu:8194
## jun : 5014 tue:7683
## nov : 3973 wed:7790
## apr : 2493
## (Other): 1869
## campaign pdays previous poutcome
## Min. : 1.000 Min. :0.0000 Min. :0.0000 failure : 4044
## 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.:0.0000 nonexistent:33877
## Median : 2.000 Median :0.0000 Median :0.0000 success : 1270
## Mean : 2.566 Mean :0.0357 Mean :0.1704
## 3rd Qu.: 3.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :56.000 Max. :1.0000 Max. :7.0000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.40000 Min. :92.20 Min. :-50.80 Min. :0.634
## 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.70 1st Qu.:1.344
## Median : 1.10000 Median :93.44 Median :-41.80 Median :4.857
## Mean : 0.08324 Mean :93.57 Mean :-40.54 Mean :3.624
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.40 3rd Qu.:4.961
## Max. : 1.40000 Max. :94.77 Max. :-26.90 Max. :5.045
##
## nr.employed y
## Min. :4964 no :34831
## 1st Qu.:5099 yes: 4360
## Median :5191
## Mean :5167
## 3rd Qu.:5228
## Max. :5228
##
#convert variable types
sapply(bank,class)
## age job marital education default
## "integer" "factor" "factor" "factor" "character"
## housing loan contact month day_of_week
## "character" "character" "factor" "factor" "factor"
## campaign pdays previous poutcome emp.var.rate
## "integer" "numeric" "integer" "factor" "numeric"
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## "numeric" "numeric" "numeric" "numeric" "factor"
#numerical variables
bank$age <- as.numeric(bank$age)
bank$campaign <- as.numeric(bank$campaign)
bank$previous <- as.numeric(bank$previous)
bank$emp.var.rate <- as.numeric(bank$emp.var.rate)
bank$cons.price.idx <- as.numeric(bank$cons.price.idx)
bank$cons.conf.idx <- as.numeric(bank$cons.conf.idx)
bank$euribor3m <- as.numeric(bank$euribor3m)
bank$nr.employed <- as.numeric(bank$nr.employed)
#categorical variables
bank$job <-as.factor(bank$job)
bank$marital <-as.factor(bank$marital)
bank$education <-as.factor(bank$education)
bank$default <-as.factor(bank$default)
bank$loan <-as.factor(bank$loan)
bank$housing<-as.factor(bank$housing)
bank$contact <-as.factor(bank$contact)
bank$poutcome <-as.factor(bank$poutcome)
bank$day_of_week <-as.factor(bank$day_of_week)
bank$month <-as.factor(bank$month)
bank$y<-ifelse(bank$y =='yes',1,0)
bank$y <-as.factor(bank$y)
#Check for outliers for numerical variables
p1_age <- ggplot(bank, aes(y, age)) + geom_boxplot(aes(fill = y))
p1_campaign <- ggplot(bank, aes(y, campaign)) + geom_boxplot(aes(fill = y))
p1_previous <- ggplot(bank, aes(y, previous)) + geom_boxplot(aes(fill = y))
p1_emp.var.rate <- ggplot(bank, aes(y, emp.var.rate)) + geom_boxplot(aes(fill = y))
p1_cons.price.idx <- ggplot(bank, aes(y, cons.price.idx)) + geom_boxplot(aes(fill = y))
p1_cons.conf.idx<- ggplot(bank, aes(y, cons.conf.idx)) + geom_boxplot(aes(fill = y))
p1_euribor3m<- ggplot(bank, aes(y, euribor3m)) + geom_boxplot(aes(fill = y))
p1_nr.employed<- ggplot(bank, aes(y, nr.employed)) + geom_boxplot(aes(fill = y))
a1 <- c(p1_age,p1_campaign)
ggarrange(p1_age,p1_campaign,
nrow = 1)

b1 <- c(p1_previous,p1_emp.var.rate,
p1_cons.price.idx)
ggarrange(p1_previous,p1_emp.var.rate,
p1_cons.price.idx,
nrow = 1)

g1 <- c(p1_cons.conf.idx,
p1_euribor3m,p1_nr.employed)
ggarrange(p1_cons.conf.idx,p1_euribor3m,p1_nr.employed,
nrow = 1)

#age==38770
x <- bank$age
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
hb <- H + qnt[2]
hb #remove>69.5
## 75%
## 69.5
ab <- bank[which(bank$age<hb),]
#campaign==35982
x1 <- bank$campaign
qnt1 <- quantile(x1, probs=c(.25, .75), na.rm = T)
H1 <- 1.5 * IQR(x1, na.rm = T)
hb1<- H1 + qnt1[2]
hb1 #remove>6
## 75%
## 6
ac <- bank[which(bank$campaign<hb1),]
#cons.conf.idx
x5 <- bank$cons.conf.idx
qnt5 <- quantile(x5, probs=c(.25, .75), na.rm = T)
H5 <- 1.5 * IQR(x5, na.rm = T)
hb5<- H5 + qnt5[2]
hb5 #remove>-26.95
## 75%
## -26.95
#From the boxplot for "previous", I decided to treat observations larger than 2 as outliers, thus remove them.
#Result after removing outliers in numerical variables(34,370obs with 20 variables)
bank <- bank[which(bank$age<hb & bank$campaign<hb1 & bank$previous<2 & bank$cons.conf.idx<hb5),]
#Check for outliers for categorical variables with more than 3 categories
pic_job1 <-ggplot(bank, aes(x=job)) + geom_histogram(aes(y=(..count..)), stat='count', fill="slate blue", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Job",
x="Job", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_job1

#From the histogram, there is no obvious small number of counts for jobs
pic_edu1 <-ggplot(bank, aes(x=education)) + geom_histogram(aes(y=(..count..)), stat='count', fill="yellowgreen", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Education",
x="Education Status", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_edu1

table(bank$education)
##
## basic.4y basic.6y basic.9y
## 3450 2022 5418
## high.school illiterate professional.course
## 8330 16 4592
## university.degree
## 10542
#From the histogram, there is one obvious tiny number of counts for "illterate"(16 observations)
#Thus I decided to drop obs with "illiterate"
bank <-bank[bank$education!="illiterate",,drop=FALSE]
#After removing outliers for both numerical and categorical variables, there are 34,354 obs with 20 variables.
#Check and adjust data imbalance
counts <- table(bank$y)
barplot(counts,col=c("royalblue3","tomato3"),legend = rownames(counts), main = "Term Deposit")

CrossTable(bank$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 34354
##
##
## | 0 | 1 |
## |-----------|-----------|
## | 30933 | 3421 |
## | 0.900 | 0.100 |
## |-----------|-----------|
##
##
##
##
#From the graph and the table, we can see that the dataset is highly imbalanced
#Since most machine learning classification algorithms are sensitive to unbalance in the predictor classes.
#I decided to resample the data by Synthetic Minority Oversampling Technique (SMOTE),
#which is a popular algorithm to creates synthetic observations of the minority class
set.seed(88)
balanced_data <- ROSE(y~., data=bank,seed=1)$data
CrossTable(balanced_data$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 34354
##
##
## | 0 | 1 |
## |-----------|-----------|
## | 17245 | 17109 |
## | 0.502 | 0.498 |
## |-----------|-----------|
##
##
##
##
counts1 <- table(balanced_data$y)
barplot(counts1,col=c("royalblue3","tomato3"),legend = rownames(counts), main = "Customers' Responses")

#Hence, we now have 17245(50.2%) "no" responses, and 17109 (49.8%)"yes" responses, thus the data is balanced.
#=============================================
##Summary Statistics
summary(balanced_data)
## age job marital
## Min. : 4.662 admin. :9514 divorced: 3599
## 1st Qu.:31.075 blue-collar:6796 married :20133
## Median :37.897 technician :5838 single :10622
## Mean :39.320 services :3068
## 3rd Qu.:46.925 management :2523
## Max. :82.032 retired :1362
## (Other) :5253
## education default
## basic.4y : 3045 no :28726
## basic.6y : 1919 refuse2disclose: 5627
## basic.9y : 4836 yes : 1
## high.school : 8376
## illiterate : 0
## professional.course: 4668
## university.degree :11510
## housing loan contact
## no :15248 no :28474 cellular :24410
## refuse2disclose: 798 refuse2disclose: 798 telephone: 9944
## yes :18308 yes : 5082
##
##
##
##
## month day_of_week campaign pdays
## may :9978 fri:6312 Min. :-0.8292 Min. :-0.628708
## jul :5427 mon:6892 1st Qu.: 1.0059 1st Qu.:-0.041724
## aug :4872 thu:7456 Median : 1.6486 Median : 0.005368
## jun :4415 tue:6744 Mean : 1.8994 Mean : 0.064230
## nov :3463 wed:6950 3rd Qu.: 2.5750 3rd Qu.: 0.061367
## apr :3275 Max. : 6.8893 Max. : 1.547210
## (Other):2924
## previous poutcome emp.var.rate cons.price.idx
## Min. :-0.80396 failure : 3312 Min. :-6.0150 Min. :91.13
## 1st Qu.:-0.08213 nonexistent:28827 1st Qu.:-1.8975 1st Qu.:92.98
## Median : 0.03874 success : 2215 Median :-0.1352 Median :93.48
## Mean : 0.16018 Mean :-0.3556 Mean :93.48
## 3rd Qu.: 0.20029 3rd Qu.: 1.2431 3rd Qu.:94.00
## Max. : 1.67241 Max. : 4.2551 Max. :95.76
##
## cons.conf.idx euribor3m nr.employed y
## Min. :-59.00 Min. :-2.123 Min. :4834 0:17245
## 1st Qu.:-44.75 1st Qu.: 1.254 1st Qu.:5083 1:17109
## Median :-41.17 Median : 3.658 Median :5167
## Mean :-40.77 Mean : 3.135 Mean :5146
## 3rd Qu.:-36.68 3rd Qu.: 4.901 3rd Qu.:5217
## Max. :-17.42 Max. : 8.384 Max. :5381
##
#categorical variables exploration
pic_job <-ggplot(balanced_data, aes(x=job)) + geom_histogram(aes(y=(..count..)), stat='count', fill="slate blue", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Job",
x="Job", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_job

#The graph shows that the there are alot of customers work in administritive sector, and the least as entrepreneur.
aa <-ggplot(balanced_data, aes(x = job , fill = y)) +
geom_bar(stat='count', position='dodge')
aa

#The graph shows that there are customers that are admin, retired, or technicial are more willing to accept the offer.
#\\\\\\
pic_marital <-ggplot(balanced_data, aes(x=marital)) + geom_histogram(aes(y=(..count..)), stat='count', fill="light pink", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Marital",
x="Marital Status", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_marital

bb<-ggplot(balanced_data, aes(x = marital , fill = y)) +
geom_bar(stat='count', position='dodge')
bb

#\\\\\\
pic_edu <-ggplot(balanced_data, aes(x=education)) + geom_histogram(aes(y=(..count..)), stat='count', fill="yellowgreen", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Education",
x="Education Status", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_edu

cc<-ggplot(balanced_data, aes(x = education , fill = y)) +
geom_bar(stat='count', position='dodge')
cc

#\\\\\\
pic_default <-ggplot(balanced_data, aes(x=default)) + geom_histogram(aes(y=(..count..)), stat='count', fill="light blue", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Default",
x="Default Status", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_default

dd<-ggplot(balanced_data, aes(x = default , fill = y)) +
geom_bar(stat='count', position='dodge')
dd

#\\\\\\
pic_loan <-ggplot(balanced_data, aes(x=loan)) + geom_histogram(aes(y=(..count..)), stat='count', fill="orange1", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Loan",
x="Loan Status", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_loan

ee<-ggplot(balanced_data, aes(x = loan , fill = y)) +
geom_bar(stat='count', position='dodge')
ee

#\\\\\\
pic_housing <-ggplot(balanced_data, aes(x=housing)) + geom_histogram(aes(y=(..count..)), stat='count', fill="grey69", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Housing",
x="Housing Status", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_housing

ff<-ggplot(balanced_data, aes(x = housing , fill = y)) +
geom_bar(stat='count', position='dodge')
ff

#\\\\\\
pic_contact <-ggplot(balanced_data, aes(x=contact)) + geom_histogram(aes(y=(..count..)), stat='count', fill="firebrick", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Contact",
x="Contact Approach", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_contact

gg<-ggplot(balanced_data, aes(x = contact , fill = y)) +
geom_bar(stat='count', position='dodge')
gg

#\\\\\\
pic_poutcome <-ggplot(balanced_data, aes(x=poutcome)) + geom_histogram(aes(y=(..count..)), stat='count', fill="yellow1", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "poutcome",
x="Previous Outcome", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_poutcome

hh<-ggplot(balanced_data, aes(x = poutcome , fill = y)) +
geom_bar(stat='count', position='dodge')
hh

#\\\\\\
pic_dow <-ggplot(balanced_data, aes(x=day_of_week)) + geom_histogram(aes(y=(..count..)), stat='count', fill="turquoise4", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Day of Week",
x="Day of Week", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_dow

jj<-ggplot(balanced_data, aes(x = day_of_week , fill = y)) +
geom_bar(stat='count', position='dodge')
jj

#\\\\\\
pic_month <-ggplot(balanced_data, aes(x=month)) + geom_histogram(aes(y=(..count..)), stat='count', fill="darkseagreen4", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Month",
x="Months", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_month

kk<-ggplot(balanced_data, aes(x = month , fill = y)) +
geom_bar(stat='count', position='dodge')
kk

#\\\\\\
#response variable
pic_y <-ggplot(balanced_data, aes(x=y)) + geom_histogram(aes(y=(..count..)), stat='count', fill="red", alpha=0.5) + theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size=10),
axis.text.y = element_text(size=10)) +
labs(title = "Subscribe or not",
x="Subscription", y="Counts")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
pic_y

CrossTable(balanced_data$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 34354
##
##
## | 0 | 1 |
## |-----------|-----------|
## | 17245 | 17109 |
## | 0.502 | 0.498 |
## |-----------|-----------|
##
##
##
##
#numerical variables exploration
p_age <- ggplot(balanced_data, aes(y, age)) + geom_boxplot(aes(fill = y))
hist(balanced_data$age, col = "yellow2", freq = FALSE)
abline(v = mean(balanced_data$age),
col = "royalblue",
lwd = 2)
abline(v = median(balanced_data$age),
col = "light pink",
lwd = 2)
legend(x = "topright",
c("Density plot", "Mean", "Median"),
col = c("yellow2", "royalblue", "light pink"),
lwd = c(2, 2, 2))

#The distribution shows that most customers oberved are less than 40 years old.
p_campaign <- ggplot(balanced_data, aes(y, campaign)) + geom_boxplot(aes(fill = y))
p_pdays <- ggplot(balanced_data, aes(y, pdays)) + geom_boxplot(aes(fill = y))
p_pdays

p_previous <- ggplot(balanced_data, aes(y, previous)) + geom_boxplot(aes(fill = y))
p_previous

p_emp.var.rate <- ggplot(balanced_data, aes(y, emp.var.rate)) + geom_boxplot(aes(fill = y))
p_cons.price.idx <- ggplot(balanced_data, aes(y, cons.price.idx)) + geom_boxplot(aes(fill = y))
p_cons.conf.idx<- ggplot(balanced_data, aes(y, cons.conf.idx)) + geom_boxplot(aes(fill = y))
p_euribor3m<- ggplot(balanced_data, aes(y, euribor3m)) + geom_boxplot(aes(fill = y))
p_nr.employed<- ggplot(balanced_data, aes(y, nr.employed)) + geom_boxplot(aes(fill = y))
a <- c(p_age,p_campaign,p_pdays)
ggarrange(p_age,p_campaign,
nrow = 1)

b <- c(p_previous,p_emp.var.rate,
p_cons.price.idx)
ggarrange(p_previous,p_emp.var.rate,
p_cons.price.idx,
nrow = 1)

g <- c(p_cons.conf.idx,
p_euribor3m,p_nr.employed)
ggarrange(p_cons.conf.idx,p_euribor3m,p_nr.employed,
nrow = 1)

numericdata <- subset(balanced_data, select=c("age", "campaign","previous","emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed","pdays","previous"))
pairs(numericdata)

M <- cor(numericdata)
corrplot(M, method = "circle")

#or view in corr magnitudes
corrplot(M, method = "number")

#From the correlation plot, we can see that there are good correlations between 'cons.price.idx'&'emp.var.rate', 'cons.conf.idx'&'emp.var.rate',cons.conf.idx'&'cons.price.idx','cons.price.idx'&'nr.employed', cons.conf.idx'&'nr.employed','emp.var.rate'& nr.employed',nr.employed'& euribor3m.
#Those multicollinearity problems may not affect our predictions but indeed affect causal inferences.
#=============================================
#Data Split
index <- createDataPartition(balanced_data$y, p = 0.5, list = FALSE)
train_data <- balanced_data[index, ]
test_data <- balanced_data[-index, ]
#U??Use of one-hot-coding to transfer categorical variables into numerical variables????
#dmy <- dummyVars(" ~ .", data = balanced_data)
#bank.dummies<- data.frame(predict(dmy, newdata = balanced_data))
#print(bank.dummies)
#===========================================================
#Regressions
###logistic
logit_model <- glm(y ~.,family=binomial(link='logit'),data =train_data)
summary(logit_model)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3508 -0.9364 -0.5245 0.9431 2.2262
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.081e+01 4.952e+00 4.202 2.64e-05 ***
## age -8.984e-05 1.932e-03 -0.046 0.962914
## jobblue-collar 2.047e-02 6.616e-02 0.309 0.757005
## jobentrepreneur 1.026e-01 9.851e-02 1.042 0.297406
## jobhousemaid 6.858e-02 1.268e-01 0.541 0.588557
## jobmanagement -6.140e-02 7.384e-02 -0.831 0.405702
## jobretired 2.440e-01 1.083e-01 2.253 0.024283 *
## jobself-employed 2.123e-02 9.545e-02 0.222 0.824008
## jobservices 5.351e-02 6.977e-02 0.767 0.443138
## jobstudent 5.663e-01 1.310e-01 4.323 1.54e-05 ***
## jobtechnician 4.112e-02 6.123e-02 0.672 0.501829
## jobunemployed -8.601e-02 1.158e-01 -0.743 0.457557
## maritalmarried 1.932e-02 5.808e-02 0.333 0.739418
## maritalsingle 2.889e-02 6.498e-02 0.445 0.656666
## educationbasic.6y 2.242e-01 9.402e-02 2.385 0.017084 *
## educationbasic.9y 5.266e-02 7.714e-02 0.683 0.494782
## educationhigh.school 1.504e-01 7.988e-02 1.882 0.059802 .
## educationprofessional.course 1.636e-01 8.853e-02 1.847 0.064685 .
## educationuniversity.degree 3.087e-01 8.106e-02 3.808 0.000140 ***
## defaultrefuse2disclose -2.201e-01 4.953e-02 -4.443 8.86e-06 ***
## housingrefuse2disclose -2.916e-01 1.142e-01 -2.554 0.010650 *
## housingyes -1.092e-01 3.573e-02 -3.058 0.002231 **
## loanrefuse2disclose NA NA NA NA
## loanyes 2.690e-02 4.921e-02 0.547 0.584602
## contacttelephone -6.221e-01 6.255e-02 -9.946 < 2e-16 ***
## monthaug -3.671e-01 9.993e-02 -3.674 0.000239 ***
## monthdec 8.460e-01 3.141e-01 2.693 0.007072 **
## monthjul 1.022e-01 8.570e-02 1.193 0.232830
## monthjun 2.056e-01 8.419e-02 2.443 0.014580 *
## monthmar 1.463e+00 1.643e-01 8.904 < 2e-16 ***
## monthmay -6.175e-01 6.864e-02 -8.997 < 2e-16 ***
## monthnov -4.608e-01 8.642e-02 -5.332 9.73e-08 ***
## monthoct 2.038e+00 2.359e-01 8.641 < 2e-16 ***
## monthsep -1.534e-01 1.670e-01 -0.919 0.358274
## day_of_weekmon -1.874e-01 5.651e-02 -3.317 0.000909 ***
## day_of_weekthu 8.685e-02 5.552e-02 1.564 0.117753
## day_of_weektue -3.579e-02 5.679e-02 -0.630 0.528485
## day_of_weekwed 7.889e-02 5.604e-02 1.408 0.159239
## campaign -6.702e-03 1.458e-02 -0.460 0.645673
## pdays -9.938e-03 1.558e-01 -0.064 0.949128
## previous -1.772e-01 1.035e-01 -1.712 0.086953 .
## poutcomenonexistent 2.709e-01 1.192e-01 2.272 0.023064 *
## poutcomesuccess 2.139e+00 2.129e-01 10.044 < 2e-16 ***
## emp.var.rate -1.177e-01 2.112e-02 -5.573 2.51e-08 ***
## cons.price.idx 4.490e-02 4.467e-02 1.005 0.314859
## cons.conf.idx 2.984e-02 4.826e-03 6.183 6.30e-10 ***
## euribor3m -9.722e-02 1.880e-02 -5.170 2.34e-07 ***
## nr.employed -4.575e-03 4.007e-04 -11.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23813 on 17177 degrees of freedom
## Residual deviance: 19390 on 17131 degrees of freedom
## AIC: 19484
##
## Number of Fisher Scoring iterations: 5
anova(logit_model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 17177 23814
## age 1 8.26 17176 23805 0.004064 **
## job 10 429.62 17166 23376 < 2.2e-16 ***
## marital 2 53.54 17164 23322 2.366e-12 ***
## education 5 92.92 17159 23229 < 2.2e-16 ***
## default 1 289.77 17158 22939 < 2.2e-16 ***
## housing 2 3.76 17156 22936 0.152368
## loan 1 1.00 17155 22935 0.316613
## contact 1 747.52 17154 22187 < 2.2e-16 ***
## month 9 1264.02 17145 20923 < 2.2e-16 ***
## day_of_week 4 31.46 17141 20892 2.462e-06 ***
## campaign 1 4.86 17140 20887 0.027471 *
## pdays 1 428.49 17139 20458 < 2.2e-16 ***
## previous 1 0.10 17138 20458 0.747511
## poutcome 2 160.54 17136 20298 < 2.2e-16 ***
## emp.var.rate 1 590.88 17135 19707 < 2.2e-16 ***
## cons.price.idx 1 8.37 17134 19698 0.003822 **
## cons.conf.idx 1 45.11 17133 19653 1.867e-11 ***
## euribor3m 1 129.84 17132 19523 < 2.2e-16 ***
## nr.employed 1 133.24 17131 19390 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#confusion matrix for train
log.pred.train <-predict(logit_model,data=train_data,type="response")
log.pred1.train <-ifelse(log.pred.train>0.5,1,0)
log.confusion.matrix.train <-table(log.pred1.train,train_data$y)
log.confusion.matrix.train
##
## log.pred1.train 0 1
## 0 6985 3298
## 1 1638 5257
log.accuracy.train=sum(diag(log.confusion.matrix.train))/sum(log.confusion.matrix.train)
log.accuracy.train
## [1] 0.7126557
#confusion matrix for test
log.pred.test <-predict(logit_model,data=test_data,type="response")
log.pred1.test <-ifelse(log.pred.test>0.5,1,0)
error1 <-mean(log.pred1.test !=test_data$y)
## Warning in `!=.default`(log.pred1.test, test_data$y): longer object length
## is not a multiple of shorter object length
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
print(paste('Accuracy',1-error1))
## [1] "Accuracy 0.712481080451741"
#AUC-ROC curve(later decide whether to keep it or not)
#par(mfrow=c(1,2))
#pred <- prediction(log.pred.train, train_data$y)
#perf <- performance(pred,"tpr","fpr")
#plot(perf, main = "ROC for Logistic with training data", col='darkslategray3')
#abline(0,1)
#pred1 <- prediction(log.pred.test, test_data$y)
#perf1 <- performance(log.pred.test,"tpr","fpr")
#plot(perf1, main = "ROC for logistic with test data", col='darkslategray3')
#abline(0,1)
# AUC.log.train<- auc(roc(train_data$y,log.pred1.train))
#AUC.log.train
#AUC.log.test<- auc(roc(test_data$y,log.pred1.test))
#AUC.log.test
###Decision tree
tree_model <- rpart(y ~ ., data = train_data,method="class")
tree_model
## n= 17178
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 17178 8555 0 (0.50197928 0.49802072)
## 2) pdays< 0.08583835 13731 5299 0 (0.61408492 0.38591508)
## 4) pdays>=-0.08703113 11425 3116 0 (0.72726477 0.27273523)
## 8) euribor3m>=3.128397 7282 1215 0 (0.83315023 0.16684977) *
## 9) euribor3m< 3.128397 4143 1901 0 (0.54115375 0.45884625)
## 18) month=apr,may,nov 2892 1045 0 (0.63865837 0.36134163)
## 36) nr.employed>=5052.667 2498 765 0 (0.69375500 0.30624500) *
## 37) nr.employed< 5052.667 394 114 1 (0.28934010 0.71065990) *
## 19) month=aug,dec,jul,jun,mar,oct,sep 1251 395 1 (0.31574740 0.68425260) *
## 5) pdays< -0.08703113 2306 123 1 (0.05333912 0.94666088) *
## 3) pdays>=0.08583835 3447 191 1 (0.05541050 0.94458950) *
fancyRpartPlot(tree_model)

#predict train
predictions <- predict(tree_model, train_data, type = "class")
#confusion matrix train
tree.confusion.matrix.train <- prop.table(table(predictions, train_data$y))
tree.confusion.matrix.train
##
## predictions 0 1
## 0 0.45406916 0.11526371
## 1 0.04791012 0.38275701
CrossTable(train_data$y, predictions,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('actual subscription status', 'predicted subscription status'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 17178
##
##
## | predicted subscription status
## actual subscription status | 0 | 1 | Row Total |
## ---------------------------|-----------|-----------|-----------|
## 0 | 7800 | 823 | 8623 |
## | 0.454 | 0.048 | |
## ---------------------------|-----------|-----------|-----------|
## 1 | 1980 | 6575 | 8555 |
## | 0.115 | 0.383 | |
## ---------------------------|-----------|-----------|-----------|
## Column Total | 9780 | 7398 | 17178 |
## ---------------------------|-----------|-----------|-----------|
##
##
#train accuracy
tree.accuracy.train=sum(diag(tree.confusion.matrix.train))/sum(tree.confusion.matrix.train)
tree.accuracy.train
## [1] 0.8368262
#predict test
cart_pred <- predict(tree_model , test_data,type="class")
# Confusion matrix for test
tree.confusion.matrix.test <- prop.table(table(cart_pred, test_data$y))
tree.confusion.matrix.test
##
## cart_pred 0 1
## 0 0.4506870 0.1103284
## 1 0.0512925 0.3876921
#test accuracy
tree.accuracy.test=sum(diag(tree.confusion.matrix.test))/sum(tree.confusion.matrix.test)
tree.accuracy.test
## [1] 0.8383791
# Cross table validation for test
CrossTable(test_data$y, cart_pred,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('actual customers responses', 'predicted customers responses'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 17176
##
##
## | predicted customers responses
## actual customers responses | 0 | 1 | Row Total |
## ---------------------------|-----------|-----------|-----------|
## 0 | 7741 | 881 | 8622 |
## | 0.451 | 0.051 | |
## ---------------------------|-----------|-----------|-----------|
## 1 | 1895 | 6659 | 8554 |
## | 0.110 | 0.388 | |
## ---------------------------|-----------|-----------|-----------|
## Column Total | 9636 | 7540 | 17176 |
## ---------------------------|-----------|-----------|-----------|
##
##
##prune tree
set.seed(123)
printcp(tree_model)
##
## Classification tree:
## rpart(formula = y ~ ., data = train_data, method = "class")
##
## Variables actually used in tree construction:
## [1] euribor3m month nr.employed pdays
##
## Root node error: 8555/17178 = 0.49802
##
## n= 17178
##
## CP nsplit rel error xerror xstd
## 1 0.358270 0 1.00000 1.00807 0.0076601
## 2 0.240795 1 0.64173 0.64243 0.0071462
## 3 0.026943 2 0.40094 0.40245 0.0061331
## 4 0.019404 4 0.34705 0.35523 0.0058461
## 5 0.010000 5 0.32764 0.33372 0.0057031
plotcp(tree_model)

tree_model$cptable[which.min(tree_model$cptable[,"xerror"]),"CP"]
## [1] 0.01
bestcp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]),"CP"]
tree.pruned <- prune(tree_model, cp = bestcp)
fancyRpartPlot(tree.pruned)

# Compute the train accuracy of the pruned tree
train_data$pred <- predict(tree.pruned, train_data, type = "class")
accuracy_prun_train <- mean(train_data$pred == train_data$y)
accuracy_prun_train
## [1] 0.8368262
pruned.confusion.matrix.train <- prop.table(table(train_data$pred, train_data$y))
pruned.confusion.matrix.train
##
## 0 1
## 0 0.45406916 0.11526371
## 1 0.04791012 0.38275701
# Compute the test accuracy of the pruned tree
test_data$pred <- predict(tree.pruned, test_data, type = "class")
accuracy_prune_test <- mean(test_data$pred== test_data$y)
accuracy_prune_test
## [1] 0.8383791
pruned.confusion.matrix.test <- prop.table(table(test_data$pred, test_data$y))
pruned.confusion.matrix.test
##
## 0 1
## 0 0.4506870 0.1103284
## 1 0.0512925 0.3876921
#The tree after being pruned is the same as before
#AUC-ROC
#par(mfrow=c(1,2))
#pred3 <- prediction(as.numeric(predictions), as.numeric(train_data$y))
#perf3 <- performance(pred3,"tpr","fpr")
#plot(perf3,main = "ROC for Tree with training data", col='darkslategray3')
#abline(0,1)
#pred4 <- prediction(as.numeric(cart_pred), as.numeric(test_data$y))
#perf4 <- performance(pred4,"tpr","fpr")
#plot(perf4,main = "ROC for Tree with test data", col='darkslategray3')
#abline(0,1)
#auc_tree_train<- auc(roc(as.numeric(train_data$y), as.numeric(predictions)))
#auc_tree_train
#auc_tree_test <- auc(roc(as.numeric(test_data$y), as.numeric(cart_pred)))
#auc_tree_test
#==============================================================
#neural nets
library(nnet)
library(NeuralNetTools)
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
##
## prediction
## The following object is masked from 'package:dplyr':
##
## compute
set.seed(888)
nn <- train(y ~ .,
data = train_data,
method = "nnet")
## # weights: 53
## initial value 13371.280674
## final value 11904.853547
## converged
## # weights: 157
## initial value 11922.455211
## final value 11904.853547
## converged
## # weights: 261
## initial value 12538.436480
## final value 11904.853547
## converged
## # weights: 53
## initial value 11926.377996
## iter 10 value 11903.523664
## iter 20 value 11403.173730
## iter 30 value 10592.551849
## iter 40 value 8822.282665
## iter 50 value 7481.891610
## iter 60 value 7093.933701
## iter 70 value 7043.888673
## iter 80 value 7036.264710
## iter 90 value 7032.114368
## iter 100 value 7002.851963
## final value 7002.851963
## stopped after 100 iterations
## # weights: 157
## initial value 11923.059630
## iter 10 value 11904.857099
## iter 20 value 11884.068833
## iter 30 value 10841.625259
## iter 40 value 8556.689789
## iter 50 value 7330.485416
## iter 60 value 7192.403274
## iter 70 value 7152.962919
## iter 80 value 7104.814746
## iter 90 value 7070.634083
## iter 100 value 7030.794184
## final value 7030.794184
## stopped after 100 iterations
## # weights: 261
## initial value 16397.080413
## iter 10 value 11904.867274
## iter 20 value 11904.780955
## iter 30 value 11902.511036
## iter 40 value 11527.437450
## iter 50 value 10922.375413
## iter 60 value 10835.814753
## iter 70 value 9628.930987
## iter 80 value 7412.770301
## iter 90 value 7155.281641
## iter 100 value 7012.390201
## final value 7012.390201
## stopped after 100 iterations
## # weights: 53
## initial value 12051.385954
## final value 11904.854532
## converged
## # weights: 157
## initial value 11921.087966
## final value 11904.856192
## converged
## # weights: 261
## initial value 13462.633566
## final value 11904.858062
## converged
## # weights: 53
## initial value 12438.145104
## final value 11905.549249
## converged
## # weights: 157
## initial value 12899.458187
## final value 11905.549249
## converged
## # weights: 261
## initial value 14101.505302
## final value 11905.549249
## converged
## # weights: 53
## initial value 13247.038648
## iter 10 value 11905.035835
## iter 20 value 11890.711499
## iter 30 value 11814.584646
## iter 40 value 11637.972564
## iter 50 value 11242.934602
## iter 60 value 10769.400613
## iter 70 value 10149.245549
## iter 80 value 8406.985171
## iter 90 value 7751.012655
## iter 100 value 7348.360292
## final value 7348.360292
## stopped after 100 iterations
## # weights: 157
## initial value 12797.103858
## iter 10 value 11905.543517
## iter 20 value 11904.898617
## iter 30 value 11577.377113
## iter 40 value 11302.563479
## iter 50 value 11068.084614
## iter 60 value 10375.662490
## iter 70 value 9608.801945
## iter 80 value 8720.792357
## iter 90 value 7677.054674
## iter 100 value 7594.182718
## final value 7594.182718
## stopped after 100 iterations
## # weights: 261
## initial value 12197.265452
## iter 10 value 11900.703103
## iter 20 value 11552.170856
## iter 30 value 10845.628190
## iter 40 value 10449.540319
## iter 50 value 8909.638226
## iter 60 value 8069.561773
## iter 70 value 7612.100487
## iter 80 value 7362.173558
## iter 90 value 7279.984175
## iter 100 value 7247.870234
## final value 7247.870234
## stopped after 100 iterations
## # weights: 53
## initial value 12438.077656
## final value 11905.550095
## converged
## # weights: 157
## initial value 11968.328805
## final value 11905.552126
## converged
## # weights: 261
## initial value 12745.245106
## final value 11905.553257
## converged
## # weights: 53
## initial value 12770.846520
## final value 11906.278698
## converged
## # weights: 157
## initial value 11932.748834
## final value 11906.278698
## converged
## # weights: 261
## initial value 12667.944088
## final value 11906.278698
## converged
## # weights: 53
## initial value 12833.077204
## final value 11906.279243
## converged
## # weights: 157
## initial value 12290.772490
## iter 10 value 11906.281718
## iter 20 value 11906.107182
## iter 30 value 10457.810517
## iter 40 value 10106.838512
## iter 50 value 9184.360300
## iter 60 value 8990.970320
## iter 70 value 7895.208422
## iter 80 value 7421.777703
## iter 90 value 7166.124354
## iter 100 value 7085.873797
## final value 7085.873797
## stopped after 100 iterations
## # weights: 261
## initial value 16828.059567
## iter 10 value 11906.110222
## iter 20 value 11891.285568
## iter 30 value 11848.564417
## iter 40 value 10076.026511
## iter 50 value 7912.376697
## iter 60 value 7520.351081
## iter 70 value 7239.962623
## iter 80 value 7065.978303
## iter 90 value 6982.235930
## iter 100 value 6935.806907
## final value 6935.806907
## stopped after 100 iterations
## # weights: 53
## initial value 12450.613400
## final value 11906.279343
## converged
## # weights: 157
## initial value 11919.673642
## final value 11906.281264
## converged
## # weights: 261
## initial value 11920.101347
## final value 11906.282925
## converged
## # weights: 53
## initial value 12032.192383
## final value 11906.731377
## converged
## # weights: 157
## initial value 13057.775733
## final value 11906.731377
## converged
## # weights: 261
## initial value 11918.950550
## final value 11906.731377
## converged
## # weights: 53
## initial value 11955.986279
## iter 10 value 11906.579524
## iter 20 value 11859.408545
## iter 30 value 10584.555385
## iter 40 value 9507.106943
## iter 50 value 8470.370494
## iter 60 value 7495.951733
## iter 70 value 7094.722711
## iter 80 value 7043.442156
## iter 90 value 7024.056373
## iter 100 value 7015.136803
## final value 7015.136803
## stopped after 100 iterations
## # weights: 157
## initial value 12128.318771
## iter 10 value 11906.582219
## iter 20 value 11868.447862
## iter 30 value 10901.820722
## iter 40 value 9631.237721
## iter 50 value 8475.922734
## iter 60 value 7680.412082
## iter 70 value 7225.264064
## iter 80 value 6933.382715
## iter 90 value 6795.254967
## iter 100 value 6711.636656
## final value 6711.636656
## stopped after 100 iterations
## # weights: 261
## initial value 12246.003305
## iter 10 value 11903.671350
## iter 20 value 11821.106192
## iter 30 value 11024.470587
## iter 40 value 9236.008682
## iter 50 value 8155.009940
## iter 60 value 7282.918018
## iter 70 value 7192.281481
## iter 80 value 7122.010104
## iter 90 value 7071.315239
## iter 100 value 7006.721210
## final value 7006.721210
## stopped after 100 iterations
## # weights: 53
## initial value 13566.927379
## final value 11906.732306
## converged
## # weights: 157
## initial value 11909.576728
## final value 11906.733873
## converged
## # weights: 261
## initial value 11926.223994
## final value 11906.735731
## converged
## # weights: 53
## initial value 12079.285688
## final value 11906.463123
## converged
## # weights: 157
## initial value 11960.152086
## final value 11906.463123
## converged
## # weights: 261
## initial value 11913.996327
## final value 11906.463123
## converged
## # weights: 53
## initial value 14074.636587
## iter 10 value 11899.057611
## iter 20 value 11847.861975
## iter 30 value 11764.361376
## iter 40 value 11043.836462
## iter 50 value 10215.926309
## iter 60 value 9939.773030
## iter 70 value 9733.401441
## iter 80 value 9415.398494
## iter 90 value 9246.370891
## iter 100 value 9136.333166
## final value 9136.333166
## stopped after 100 iterations
## # weights: 157
## initial value 11960.938163
## iter 10 value 11906.396833
## iter 20 value 11652.315700
## iter 30 value 10653.528466
## iter 40 value 8701.192865
## iter 50 value 8034.603201
## iter 60 value 7397.186456
## iter 70 value 7194.289619
## iter 80 value 7106.186297
## iter 90 value 7029.173095
## iter 100 value 6976.720457
## final value 6976.720457
## stopped after 100 iterations
## # weights: 261
## initial value 11914.551547
## iter 10 value 11906.387890
## iter 20 value 11903.793697
## iter 30 value 10787.741283
## iter 40 value 7810.993015
## iter 50 value 7507.754470
## iter 60 value 7420.253957
## iter 70 value 7359.758160
## iter 80 value 7247.058728
## iter 90 value 7107.692996
## iter 100 value 7059.041260
## final value 7059.041260
## stopped after 100 iterations
## # weights: 53
## initial value 12760.450425
## final value 11906.464025
## converged
## # weights: 157
## initial value 12505.888769
## final value 11906.465445
## converged
## # weights: 261
## initial value 12019.640189
## final value 11906.467178
## converged
## # weights: 53
## initial value 11942.344805
## final value 11905.939184
## converged
## # weights: 157
## initial value 12531.698931
## final value 11905.939184
## converged
## # weights: 261
## initial value 12224.460111
## final value 11905.939184
## converged
## # weights: 53
## initial value 12434.194716
## iter 10 value 11905.909868
## iter 20 value 9488.493950
## iter 30 value 7769.756809
## iter 40 value 7312.110180
## iter 50 value 7225.267161
## iter 60 value 7195.331725
## iter 70 value 7163.362029
## iter 80 value 7149.689917
## iter 90 value 7117.908482
## iter 100 value 7075.182650
## final value 7075.182650
## stopped after 100 iterations
## # weights: 157
## initial value 11913.858795
## iter 10 value 11905.948857
## iter 20 value 11905.767062
## iter 30 value 11893.937597
## iter 40 value 11707.051620
## iter 50 value 11254.281196
## iter 60 value 10945.389072
## iter 70 value 10562.496001
## iter 80 value 9209.081686
## iter 90 value 7664.043475
## iter 100 value 7567.794581
## final value 7567.794581
## stopped after 100 iterations
## # weights: 261
## initial value 11982.591215
## iter 10 value 11905.768213
## iter 20 value 11833.756355
## iter 30 value 10651.051954
## iter 40 value 9677.835071
## iter 50 value 7588.213045
## iter 60 value 7440.733511
## iter 70 value 7318.085141
## iter 80 value 7309.734959
## iter 90 value 7299.484355
## iter 100 value 7238.072140
## final value 7238.072140
## stopped after 100 iterations
## # weights: 53
## initial value 12699.862470
## final value 11905.940016
## converged
## # weights: 157
## initial value 11955.721752
## final value 11905.941522
## converged
## # weights: 261
## initial value 11907.826694
## final value 11905.943384
## converged
## # weights: 53
## initial value 11907.718187
## final value 11905.524216
## converged
## # weights: 157
## initial value 12179.034698
## final value 11905.524216
## converged
## # weights: 261
## initial value 13368.748053
## final value 11905.524216
## converged
## # weights: 53
## initial value 12290.819165
## iter 10 value 11905.502819
## iter 20 value 11852.180841
## iter 30 value 10593.841336
## iter 40 value 9226.494186
## iter 50 value 8389.023618
## iter 60 value 8275.085316
## iter 70 value 7320.231438
## iter 80 value 7162.964814
## iter 90 value 7120.400329
## iter 100 value 7111.601812
## final value 7111.601812
## stopped after 100 iterations
## # weights: 157
## initial value 14374.646606
## iter 10 value 11905.036721
## iter 20 value 11904.428434
## iter 30 value 11532.203209
## iter 40 value 10560.325079
## iter 50 value 10235.116345
## iter 60 value 10088.095662
## iter 70 value 9724.776028
## iter 80 value 9220.660287
## iter 90 value 8459.680058
## iter 100 value 7702.683499
## final value 7702.683499
## stopped after 100 iterations
## # weights: 261
## initial value 12777.988057
## iter 10 value 11905.192425
## iter 20 value 11569.632660
## iter 30 value 10780.171019
## iter 40 value 8448.906519
## iter 50 value 7631.108172
## iter 60 value 7523.133813
## iter 70 value 7371.167838
## iter 80 value 7142.157696
## iter 90 value 7058.557071
## iter 100 value 6991.046694
## final value 6991.046694
## stopped after 100 iterations
## # weights: 53
## initial value 15146.840094
## final value 11905.524988
## converged
## # weights: 157
## initial value 15379.578590
## final value 11905.526543
## converged
## # weights: 261
## initial value 12458.555030
## final value 11905.528372
## converged
## # weights: 53
## initial value 11921.563525
## final value 11905.205650
## converged
## # weights: 157
## initial value 12814.629987
## final value 11905.205650
## converged
## # weights: 261
## initial value 12575.735968
## final value 11905.205650
## converged
## # weights: 53
## initial value 11907.804027
## iter 10 value 11905.208334
## final value 11905.205719
## converged
## # weights: 157
## initial value 12703.677703
## iter 10 value 11905.214792
## iter 20 value 11904.756595
## iter 30 value 11755.847038
## iter 40 value 11741.585923
## iter 50 value 11273.738788
## iter 60 value 9909.944002
## iter 70 value 8333.144216
## iter 80 value 7604.081003
## iter 90 value 7270.397781
## iter 100 value 7099.773414
## final value 7099.773414
## stopped after 100 iterations
## # weights: 261
## initial value 14894.053460
## iter 10 value 11905.202112
## iter 20 value 11902.502354
## iter 30 value 11809.120543
## iter 40 value 10998.692213
## iter 50 value 7674.980557
## iter 60 value 7509.239638
## iter 70 value 7414.561177
## iter 80 value 7184.287863
## iter 90 value 7059.860094
## iter 100 value 6928.616002
## final value 6928.616002
## stopped after 100 iterations
## # weights: 53
## initial value 11938.743487
## final value 11905.206494
## converged
## # weights: 157
## initial value 13018.214946
## final value 11905.207854
## converged
## # weights: 261
## initial value 12416.071459
## final value 11905.210075
## converged
## # weights: 53
## initial value 12238.292670
## final value 11906.517147
## converged
## # weights: 157
## initial value 13175.771246
## final value 11906.517147
## converged
## # weights: 261
## initial value 12446.614366
## final value 11906.517147
## converged
## # weights: 53
## initial value 12247.894705
## iter 10 value 11906.264529
## iter 20 value 11853.819380
## iter 30 value 10986.062254
## iter 40 value 9762.045804
## iter 50 value 9114.067298
## iter 60 value 7935.441116
## iter 70 value 7304.253481
## iter 80 value 7130.926194
## iter 90 value 7088.658184
## iter 100 value 7045.891747
## final value 7045.891747
## stopped after 100 iterations
## # weights: 157
## initial value 12671.959062
## iter 10 value 11906.495450
## iter 20 value 11827.396148
## iter 30 value 10861.510571
## iter 40 value 7623.787755
## iter 50 value 7333.499226
## iter 60 value 7221.085899
## iter 70 value 7144.234767
## iter 80 value 7111.196619
## iter 90 value 7099.605868
## iter 100 value 7095.496964
## final value 7095.496964
## stopped after 100 iterations
## # weights: 261
## initial value 11912.092092
## iter 10 value 11904.386401
## iter 20 value 11639.150357
## iter 30 value 11218.547909
## iter 40 value 9573.034022
## iter 50 value 8857.471134
## iter 60 value 7964.101568
## iter 70 value 7394.646379
## iter 80 value 7230.914157
## iter 90 value 7157.074051
## iter 100 value 7122.122594
## final value 7122.122594
## stopped after 100 iterations
## # weights: 53
## initial value 11926.825561
## final value 11906.518186
## converged
## # weights: 157
## initial value 11964.200413
## final value 11906.519908
## converged
## # weights: 261
## initial value 13516.865754
## final value 11906.521593
## converged
## # weights: 53
## initial value 13173.780303
## final value 11906.542761
## converged
## # weights: 157
## initial value 12028.729840
## final value 11906.542761
## converged
## # weights: 261
## initial value 11911.435918
## final value 11906.542761
## converged
## # weights: 53
## initial value 12189.976865
## final value 11906.542767
## converged
## # weights: 157
## initial value 12446.762420
## iter 10 value 11906.454041
## iter 20 value 11109.957093
## iter 30 value 10825.228784
## iter 40 value 10051.067425
## iter 50 value 8972.904263
## iter 60 value 8158.014036
## iter 70 value 7684.265425
## iter 80 value 7445.718520
## iter 90 value 7208.466511
## iter 100 value 7190.107309
## final value 7190.107309
## stopped after 100 iterations
## # weights: 261
## initial value 12593.155907
## iter 10 value 11906.534736
## iter 20 value 11533.527198
## iter 30 value 10539.986797
## iter 40 value 9441.192327
## iter 50 value 7768.278947
## iter 60 value 7494.960564
## iter 70 value 7418.455492
## iter 80 value 7253.621284
## iter 90 value 7186.942531
## iter 100 value 7177.292252
## final value 7177.292252
## stopped after 100 iterations
## # weights: 53
## initial value 11933.330091
## final value 11906.543853
## converged
## # weights: 157
## initial value 12765.749541
## final value 11906.545187
## converged
## # weights: 261
## initial value 18003.372264
## final value 11906.546693
## converged
## # weights: 53
## initial value 12346.444469
## final value 11906.852462
## converged
## # weights: 157
## initial value 12610.300290
## final value 11906.852462
## converged
## # weights: 261
## initial value 13921.584586
## final value 11906.852462
## converged
## # weights: 53
## initial value 12198.945572
## iter 10 value 11875.707774
## iter 20 value 11025.097409
## iter 30 value 10107.015291
## iter 40 value 8570.447634
## iter 50 value 7653.155540
## iter 60 value 7414.704549
## iter 70 value 7272.916863
## iter 80 value 7211.995456
## iter 90 value 7172.010451
## iter 100 value 7147.086163
## final value 7147.086163
## stopped after 100 iterations
## # weights: 157
## initial value 11909.520226
## iter 10 value 11906.778866
## iter 20 value 10720.693194
## iter 30 value 9452.753810
## iter 40 value 8521.876719
## iter 50 value 8029.242070
## iter 60 value 7683.968916
## iter 70 value 7501.982754
## iter 80 value 7296.944919
## iter 90 value 7137.733178
## iter 100 value 7082.624490
## final value 7082.624490
## stopped after 100 iterations
## # weights: 261
## initial value 12925.751638
## iter 10 value 11906.730235
## iter 20 value 11704.036274
## iter 30 value 10224.258781
## iter 40 value 9412.820705
## iter 50 value 8291.407366
## iter 60 value 7546.961264
## iter 70 value 7459.750581
## iter 80 value 7386.657974
## iter 90 value 7250.654620
## iter 100 value 7095.115398
## final value 7095.115398
## stopped after 100 iterations
## # weights: 53
## initial value 13099.743440
## final value 11906.853460
## converged
## # weights: 157
## initial value 12004.423032
## final value 11906.855157
## converged
## # weights: 261
## initial value 12087.399816
## final value 11906.856934
## converged
## # weights: 53
## initial value 11994.204091
## final value 11906.797392
## converged
## # weights: 157
## initial value 11906.817579
## final value 11906.797392
## converged
## # weights: 261
## initial value 12109.007571
## final value 11906.797392
## converged
## # weights: 53
## initial value 11916.475807
## iter 10 value 11906.617712
## iter 20 value 11203.821234
## iter 30 value 9656.219210
## iter 40 value 8034.452514
## iter 50 value 7369.184651
## iter 60 value 7252.769624
## iter 70 value 7202.103433
## iter 80 value 7186.531414
## iter 90 value 7185.101899
## iter 100 value 7183.923347
## final value 7183.923347
## stopped after 100 iterations
## # weights: 157
## initial value 13152.148891
## iter 10 value 11906.800687
## iter 20 value 11868.769462
## iter 30 value 11060.168756
## iter 40 value 9373.264532
## iter 50 value 8695.223185
## iter 60 value 8244.502376
## iter 70 value 7562.893400
## iter 80 value 7353.821598
## iter 90 value 7304.901748
## iter 100 value 7254.578970
## final value 7254.578970
## stopped after 100 iterations
## # weights: 261
## initial value 11993.946562
## iter 10 value 11906.668505
## iter 20 value 11653.560645
## iter 30 value 9695.611118
## iter 40 value 7930.697720
## iter 50 value 7510.244893
## iter 60 value 7327.821621
## iter 70 value 7267.612416
## iter 80 value 7200.617422
## iter 90 value 7187.045995
## iter 100 value 7184.604900
## final value 7184.604900
## stopped after 100 iterations
## # weights: 53
## initial value 12393.786764
## final value 11906.798089
## converged
## # weights: 157
## initial value 12393.746939
## final value 11906.799674
## converged
## # weights: 261
## initial value 12559.772436
## final value 11906.801664
## converged
## # weights: 53
## initial value 12631.774938
## final value 11906.747677
## converged
## # weights: 157
## initial value 12424.935807
## final value 11906.747677
## converged
## # weights: 261
## initial value 14309.974441
## final value 11906.747677
## converged
## # weights: 53
## initial value 12198.144833
## final value 11906.747682
## converged
## # weights: 157
## initial value 13687.349866
## iter 10 value 11906.701067
## iter 20 value 11902.278350
## iter 30 value 11120.163016
## iter 40 value 9408.858230
## iter 50 value 8585.654638
## iter 60 value 7565.082349
## iter 70 value 7430.839353
## iter 80 value 7295.344377
## iter 90 value 7247.754630
## iter 100 value 7224.929875
## final value 7224.929875
## stopped after 100 iterations
## # weights: 261
## initial value 12446.843443
## final value 11906.748087
## converged
## # weights: 53
## initial value 11910.496028
## final value 11906.748419
## converged
## # weights: 157
## initial value 12107.811267
## final value 11906.750785
## converged
## # weights: 261
## initial value 12023.902906
## final value 11906.751852
## converged
## # weights: 53
## initial value 11988.679355
## final value 11905.233478
## converged
## # weights: 157
## initial value 11958.257860
## final value 11905.233478
## converged
## # weights: 261
## initial value 12970.820887
## final value 11905.233478
## converged
## # weights: 53
## initial value 11932.926246
## final value 11905.234109
## converged
## # weights: 157
## initial value 12317.451499
## iter 10 value 11905.170051
## iter 20 value 11738.398225
## iter 30 value 11231.419068
## iter 40 value 8382.671968
## iter 50 value 7507.940251
## iter 60 value 7288.198960
## iter 70 value 7210.071496
## iter 80 value 7178.355655
## iter 90 value 7173.486796
## iter 100 value 7155.956920
## final value 7155.956920
## stopped after 100 iterations
## # weights: 261
## initial value 12478.025452
## iter 10 value 11905.226339
## iter 20 value 11885.304261
## iter 30 value 10680.616548
## iter 40 value 10205.460266
## iter 50 value 8442.441808
## iter 60 value 7782.140694
## iter 70 value 7465.746995
## iter 80 value 7255.522414
## iter 90 value 7191.232962
## iter 100 value 7103.492980
## final value 7103.492980
## stopped after 100 iterations
## # weights: 53
## initial value 11913.349075
## final value 11905.234163
## converged
## # weights: 157
## initial value 14141.735260
## final value 11905.236309
## converged
## # weights: 261
## initial value 12207.972586
## final value 11905.237821
## converged
## # weights: 53
## initial value 12137.857538
## final value 11906.770380
## converged
## # weights: 157
## initial value 11999.369656
## final value 11906.770380
## converged
## # weights: 261
## initial value 11919.029016
## final value 11906.770380
## converged
## # weights: 53
## initial value 12208.058546
## final value 11906.770388
## converged
## # weights: 157
## initial value 11946.247709
## iter 10 value 11891.018005
## iter 20 value 11323.172976
## iter 30 value 10583.301615
## iter 40 value 10331.613780
## iter 50 value 8920.025815
## iter 60 value 7759.301686
## iter 70 value 7361.641406
## iter 80 value 7265.694300
## iter 90 value 7169.593424
## iter 100 value 7137.287630
## final value 7137.287630
## stopped after 100 iterations
## # weights: 261
## initial value 12708.397514
## iter 10 value 11906.712164
## iter 20 value 11802.032574
## iter 30 value 9992.531731
## iter 40 value 9078.248760
## iter 50 value 7846.234351
## iter 60 value 7418.870588
## iter 70 value 7225.832734
## iter 80 value 7151.783839
## iter 90 value 7072.161614
## iter 100 value 7044.157927
## final value 7044.157927
## stopped after 100 iterations
## # weights: 53
## initial value 11907.060013
## final value 11906.771139
## converged
## # weights: 157
## initial value 13907.823722
## final value 11906.772729
## converged
## # weights: 261
## initial value 12040.427011
## final value 11906.774819
## converged
## # weights: 53
## initial value 11909.305143
## final value 11906.656862
## converged
## # weights: 157
## initial value 11906.801277
## final value 11906.656862
## converged
## # weights: 261
## initial value 14237.139511
## final value 11906.656862
## converged
## # weights: 53
## initial value 14383.209255
## iter 10 value 11906.562382
## iter 20 value 10821.896092
## iter 30 value 9438.746887
## iter 40 value 7825.113943
## iter 50 value 7398.722152
## iter 60 value 7209.257325
## iter 70 value 7150.472450
## iter 80 value 7138.172426
## iter 90 value 7124.636937
## iter 100 value 7118.186528
## final value 7118.186528
## stopped after 100 iterations
## # weights: 157
## initial value 12846.437976
## iter 10 value 11906.606201
## iter 20 value 11869.806528
## iter 30 value 10808.431929
## iter 40 value 10089.442760
## iter 50 value 7928.782160
## iter 60 value 7528.913169
## iter 70 value 7368.543661
## iter 80 value 7319.269579
## iter 90 value 7284.970010
## iter 100 value 7193.783877
## final value 7193.783877
## stopped after 100 iterations
## # weights: 261
## initial value 12333.872591
## iter 10 value 11906.667457
## iter 20 value 11906.510611
## iter 30 value 11898.632720
## iter 40 value 10264.215099
## iter 50 value 9193.339456
## iter 60 value 8576.342368
## iter 70 value 7879.120939
## iter 80 value 7467.537496
## iter 90 value 7219.879428
## iter 100 value 7128.427135
## final value 7128.427135
## stopped after 100 iterations
## # weights: 53
## initial value 12135.082821
## final value 11906.657556
## converged
## # weights: 157
## initial value 14792.591123
## final value 11906.659309
## converged
## # weights: 261
## initial value 15334.560078
## final value 11906.660998
## converged
## # weights: 53
## initial value 12058.401240
## final value 11906.614016
## converged
## # weights: 157
## initial value 11924.995615
## final value 11906.614016
## converged
## # weights: 261
## initial value 15941.377331
## final value 11906.614016
## converged
## # weights: 53
## initial value 13474.681036
## final value 11906.614036
## converged
## # weights: 157
## initial value 11910.093425
## final value 11906.614558
## converged
## # weights: 261
## initial value 12082.291085
## iter 10 value 11906.625523
## iter 20 value 11906.524469
## iter 30 value 11868.567740
## iter 40 value 11500.553356
## iter 50 value 10631.255513
## iter 60 value 9059.798386
## iter 70 value 8119.476075
## iter 80 value 7647.045378
## iter 90 value 7384.969607
## iter 100 value 7131.727892
## final value 7131.727892
## stopped after 100 iterations
## # weights: 53
## initial value 11906.643623
## final value 11906.614848
## converged
## # weights: 157
## initial value 12468.747634
## final value 11906.616443
## converged
## # weights: 261
## initial value 12289.112922
## final value 11906.618679
## converged
## # weights: 53
## initial value 13043.483062
## final value 11906.542761
## converged
## # weights: 157
## initial value 12868.308374
## final value 11906.542761
## converged
## # weights: 261
## initial value 16256.467850
## final value 11906.542761
## converged
## # weights: 53
## initial value 12768.633952
## iter 10 value 11906.543923
## final value 11906.542796
## converged
## # weights: 157
## initial value 14695.900902
## iter 10 value 11906.479897
## iter 20 value 11867.540866
## iter 30 value 11460.572589
## iter 40 value 10373.637772
## iter 50 value 7928.117509
## iter 60 value 7545.726760
## iter 70 value 7533.172782
## iter 80 value 7400.152258
## iter 90 value 7173.022606
## iter 100 value 7087.085666
## final value 7087.085666
## stopped after 100 iterations
## # weights: 261
## initial value 11970.114742
## iter 10 value 11906.282720
## iter 20 value 11889.018748
## iter 30 value 10959.841405
## iter 40 value 9997.210037
## iter 50 value 9066.689399
## iter 60 value 7871.062676
## iter 70 value 7451.592203
## iter 80 value 7317.838483
## iter 90 value 7095.768110
## iter 100 value 6849.448508
## final value 6849.448508
## stopped after 100 iterations
## # weights: 53
## initial value 11993.055710
## final value 11906.543452
## converged
## # weights: 157
## initial value 13260.669378
## final value 11906.545458
## converged
## # weights: 261
## initial value 12156.309397
## final value 11906.546670
## converged
## # weights: 53
## initial value 11909.890924
## final value 11906.868180
## converged
## # weights: 157
## initial value 14200.861108
## final value 11906.868180
## converged
## # weights: 261
## initial value 12073.207069
## final value 11906.868180
## converged
## # weights: 53
## initial value 12139.185528
## iter 10 value 11906.573465
## iter 20 value 11113.802196
## iter 30 value 10308.140762
## iter 40 value 8384.682560
## iter 50 value 7480.557100
## iter 60 value 7373.356563
## iter 70 value 7326.319175
## iter 80 value 7287.160738
## iter 90 value 7188.210091
## iter 100 value 7120.663703
## final value 7120.663703
## stopped after 100 iterations
## # weights: 157
## initial value 11939.525145
## iter 10 value 11903.321151
## iter 20 value 11420.249871
## iter 30 value 10478.878436
## iter 40 value 8503.259206
## iter 50 value 7344.381464
## iter 60 value 7178.865463
## iter 70 value 7152.176250
## iter 80 value 7134.627879
## iter 90 value 7114.287566
## iter 100 value 7109.421197
## final value 7109.421197
## stopped after 100 iterations
## # weights: 261
## initial value 12113.220175
## final value 11906.868957
## converged
## # weights: 53
## initial value 12002.140121
## final value 11906.869089
## converged
## # weights: 157
## initial value 12111.500772
## final value 11906.870697
## converged
## # weights: 261
## initial value 12481.913796
## final value 11906.872397
## converged
## # weights: 53
## initial value 12955.562951
## final value 11906.862591
## converged
## # weights: 157
## initial value 11926.543342
## final value 11906.862591
## converged
## # weights: 261
## initial value 14171.148724
## final value 11906.862591
## converged
## # weights: 53
## initial value 12343.593179
## iter 10 value 11906.811657
## iter 20 value 11835.195685
## iter 30 value 11249.166393
## iter 40 value 8132.431282
## iter 50 value 7308.225356
## iter 60 value 7167.154073
## iter 70 value 7025.468634
## iter 80 value 6959.137751
## iter 90 value 6928.759912
## iter 100 value 6923.881761
## final value 6923.881761
## stopped after 100 iterations
## # weights: 157
## initial value 12025.550112
## iter 10 value 11904.009958
## iter 20 value 11854.869940
## iter 30 value 11312.348379
## iter 40 value 9015.939581
## iter 50 value 7389.520668
## iter 60 value 7239.016612
## iter 70 value 7173.487695
## iter 80 value 7019.213558
## iter 90 value 6961.045295
## iter 100 value 6948.179559
## final value 6948.179559
## stopped after 100 iterations
## # weights: 261
## initial value 11931.344666
## iter 10 value 11906.759531
## iter 20 value 11883.386071
## iter 30 value 11838.144178
## iter 40 value 11134.272357
## iter 50 value 10631.585484
## iter 60 value 10248.982978
## iter 70 value 8642.576053
## iter 80 value 7620.518950
## iter 90 value 7366.337422
## iter 100 value 7071.885139
## final value 7071.885139
## stopped after 100 iterations
## # weights: 53
## initial value 11914.714825
## final value 11906.863475
## converged
## # weights: 157
## initial value 12694.420330
## final value 11906.864873
## converged
## # weights: 261
## initial value 12199.303674
## final value 11906.866931
## converged
## # weights: 53
## initial value 12391.946945
## final value 11906.311764
## converged
## # weights: 157
## initial value 12583.338666
## final value 11906.311764
## converged
## # weights: 261
## initial value 12346.489618
## final value 11906.311764
## converged
## # weights: 53
## initial value 12234.045338
## iter 10 value 11906.170995
## iter 20 value 11904.844328
## iter 30 value 11710.679637
## iter 40 value 11126.666049
## iter 50 value 10570.152664
## iter 60 value 10442.631279
## iter 70 value 10213.714087
## iter 80 value 9306.175588
## iter 90 value 7707.272339
## iter 100 value 7295.717878
## final value 7295.717878
## stopped after 100 iterations
## # weights: 157
## initial value 11908.986393
## iter 10 value 11906.269507
## iter 10 value 11906.269450
## iter 20 value 11904.925954
## iter 30 value 10537.167300
## iter 40 value 8759.816028
## iter 50 value 7807.240454
## iter 60 value 7214.960445
## iter 70 value 7114.550008
## iter 80 value 7050.845185
## iter 90 value 6948.745246
## iter 100 value 6892.542127
## final value 6892.542127
## stopped after 100 iterations
## # weights: 261
## initial value 12019.846559
## iter 10 value 11906.314401
## iter 20 value 11906.074804
## iter 30 value 11072.930183
## iter 40 value 10023.614185
## iter 50 value 8248.222887
## iter 60 value 7356.468001
## iter 70 value 7181.839721
## iter 80 value 7137.070689
## iter 90 value 7119.295929
## iter 100 value 7093.917529
## final value 7093.917529
## stopped after 100 iterations
## # weights: 53
## initial value 12121.578810
## final value 11906.312775
## converged
## # weights: 157
## initial value 13490.851945
## final value 11906.314329
## converged
## # weights: 261
## initial value 13472.673749
## final value 11906.316068
## converged
## # weights: 53
## initial value 11908.666194
## final value 11906.879357
## converged
## # weights: 157
## initial value 12295.941545
## final value 11906.879357
## converged
## # weights: 261
## initial value 11926.419273
## final value 11906.879357
## converged
## # weights: 53
## initial value 12107.699563
## final value 11906.879357
## converged
## # weights: 157
## initial value 11953.221183
## iter 10 value 11906.837787
## iter 20 value 11881.883496
## iter 30 value 10927.541103
## iter 40 value 10704.517216
## iter 50 value 8190.611325
## iter 60 value 7501.097085
## iter 70 value 7365.122629
## iter 80 value 7205.268370
## iter 90 value 7144.364065
## iter 100 value 7131.820294
## final value 7131.820294
## stopped after 100 iterations
## # weights: 261
## initial value 12818.088846
## iter 10 value 11906.838267
## iter 20 value 11718.479665
## iter 30 value 11061.702156
## iter 40 value 8380.914670
## iter 50 value 7548.816411
## iter 60 value 7468.129757
## iter 70 value 7383.188811
## iter 80 value 7214.113877
## iter 90 value 6994.789583
## iter 100 value 6959.159471
## final value 6959.159471
## stopped after 100 iterations
## # weights: 53
## initial value 12072.390224
## final value 11906.880250
## converged
## # weights: 157
## initial value 12295.861327
## final value 11906.882058
## converged
## # weights: 261
## initial value 14427.844131
## final value 11906.883663
## converged
## # weights: 53
## initial value 11906.953600
## final value 11906.490601
## converged
## # weights: 157
## initial value 14909.172882
## final value 11906.490601
## converged
## # weights: 261
## initial value 13484.895920
## final value 11906.490601
## converged
## # weights: 53
## initial value 12084.714383
## final value 11906.490619
## converged
## # weights: 157
## initial value 12194.364449
## final value 11906.490618
## converged
## # weights: 261
## initial value 11923.347570
## iter 10 value 11906.352493
## iter 20 value 11873.659849
## iter 30 value 10834.256922
## iter 40 value 8178.244065
## iter 50 value 7780.471769
## iter 60 value 7654.955001
## iter 70 value 7577.982627
## iter 80 value 7386.256369
## iter 90 value 7145.879407
## iter 100 value 6840.442445
## final value 6840.442445
## stopped after 100 iterations
## # weights: 53
## initial value 12525.929612
## final value 11906.491398
## converged
## # weights: 157
## initial value 13645.829819
## final value 11906.493223
## converged
## # weights: 261
## initial value 13975.693125
## final value 11906.494811
## converged
## # weights: 53
## initial value 12759.360209
## final value 11906.731377
## converged
## # weights: 157
## initial value 12417.731575
## final value 11906.731377
## converged
## # weights: 261
## initial value 14642.443820
## final value 11906.731377
## converged
## # weights: 53
## initial value 11908.239566
## final value 11906.731402
## converged
## # weights: 157
## initial value 13350.412465
## iter 10 value 11906.633968
## iter 20 value 11095.954716
## iter 30 value 9683.448921
## iter 40 value 7686.861074
## iter 50 value 7578.406138
## iter 60 value 7492.783669
## iter 70 value 7323.950699
## iter 80 value 7110.543725
## iter 90 value 7044.011881
## iter 100 value 7022.457694
## final value 7022.457694
## stopped after 100 iterations
## # weights: 261
## initial value 13934.814064
## iter 10 value 11906.662805
## iter 20 value 11705.286812
## iter 30 value 11094.788359
## iter 40 value 10193.019569
## iter 50 value 7999.167677
## iter 60 value 7625.014795
## iter 70 value 7423.757480
## iter 80 value 7210.594483
## iter 90 value 7122.789138
## iter 100 value 7093.408667
## final value 7093.408667
## stopped after 100 iterations
## # weights: 53
## initial value 11907.036672
## final value 11906.732126
## converged
## # weights: 157
## initial value 12848.321059
## final value 11906.734001
## converged
## # weights: 261
## initial value 11939.407944
## final value 11906.735494
## converged
## # weights: 53
## initial value 12772.016621
## final value 11906.739643
## converged
## # weights: 157
## initial value 11977.295581
## final value 11906.739643
## converged
## # weights: 261
## initial value 16922.111689
## final value 11906.739643
## converged
## # weights: 53
## initial value 12240.183266
## iter 10 value 11906.618039
## iter 20 value 11886.180486
## iter 30 value 9534.564860
## iter 40 value 8758.934413
## iter 50 value 7794.743970
## iter 60 value 7407.167935
## iter 70 value 7316.817968
## iter 80 value 7303.563022
## iter 90 value 7261.626278
## iter 100 value 7214.808214
## final value 7214.808214
## stopped after 100 iterations
## # weights: 157
## initial value 12013.364729
## iter 10 value 11906.743020
## iter 20 value 11906.651000
## iter 30 value 11905.115068
## iter 40 value 11817.795528
## iter 50 value 11257.090431
## iter 60 value 10591.023171
## iter 70 value 9508.164516
## iter 80 value 8849.861726
## iter 90 value 8706.839109
## iter 100 value 8241.027224
## final value 8241.027224
## stopped after 100 iterations
## # weights: 261
## initial value 12400.250323
## iter 10 value 11905.069719
## iter 20 value 11375.646203
## iter 30 value 10915.460070
## iter 40 value 9608.289430
## iter 50 value 7902.581535
## iter 60 value 7405.309213
## iter 70 value 7321.362692
## iter 80 value 7287.334600
## iter 90 value 7220.346597
## iter 100 value 7214.996224
## final value 7214.996224
## stopped after 100 iterations
## # weights: 53
## initial value 12846.538539
## final value 11906.740468
## converged
## # weights: 157
## initial value 14624.893275
## final value 11906.742220
## converged
## # weights: 261
## initial value 12253.071724
## final value 11906.744199
## converged
## # weights: 261
## initial value 13046.092906
## iter 10 value 11906.618773
## iter 20 value 11848.034988
## iter 30 value 11662.315048
## iter 40 value 11229.716722
## iter 50 value 11108.793207
## iter 60 value 11098.992152
## iter 70 value 11081.423604
## iter 80 value 10980.371714
## iter 90 value 9280.964308
## iter 100 value 8091.121890
## final value 8091.121890
## stopped after 100 iterations
print(nn)
## Neural Network
##
## 17178 samples
## 20 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 17178, 17178, 17178, 17178, 17178, 17178, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0e+00 0.4977082 0.0000000
## 1 1e-04 0.4977082 0.0000000
## 1 1e-01 0.6857579 0.3743859
## 3 0e+00 0.4977082 0.0000000
## 3 1e-04 0.4977082 0.0000000
## 3 1e-01 0.8117347 0.6247934
## 5 0e+00 0.4977082 0.0000000
## 5 1e-04 0.4977082 0.0000000
## 5 1e-01 0.8123242 0.6249784
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.1.
plotnet(nn)

#train
nnpredtrain <- predict(nn, train_data)
resulttrainnn <-table(predicted=nnpredtrain,true=train_data$y)
resulttrainnn
## true
## predicted 0 1
## 0 7429 1874
## 1 1194 6681
acctrainnn =sum(diag(resulttrainnn))/sum(resulttrainnn)
acctrainnn
## [1] 0.8213995
#test
nnpredtest <- predict(nn, test_data)
resulttestnn <-table(predicted=nnpredtest,true=test_data$y)
resulttestnn
## true
## predicted 0 1
## 0 7378 1772
## 1 1244 6782
acctestnn =sum(diag(resulttestnn))/sum(resulttestnn)
acctestnn
## [1] 0.8244061
png("nn.png",height=2500, width=3000)
plot(nn)
dev.off()
## quartz_off_screen
## 2
#AUC-ROC curve
#par(mfrow=c(1,2))
#pred5 <- prediction(pred.train.nn, train_data$y)
#perf5 <- performance(pred5,"tpr","fpr")
#plot(perf5, main = "ROC for NN with training data", col='darkslategray3')
#abline(0,1)
#pred6 <- prediction(preds.test.nn, test_data$y)
#perf6 <- performance(pred6,"tpr","fpr")
#plot(perf6, main = "ROC for NN with test data", col='darkslategray3')
#abline(0,1)
#AUC_NN_train <- auc(roc(train_data$y, pred.train.nn))
#AUC_NN_train # Train
#=============================================================
#Random Forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
RF.model <- randomForest(y~., data=train_data, ntree=100, importance=TRUE)
RF.model
##
## Call:
## randomForest(formula = y ~ ., data = train_data, ntree = 100, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 13.42%
## Confusion matrix:
## 0 1 class.error
## 0 7653 970 0.1124899
## 1 1336 7219 0.1561660
summary(RF.model)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 17178 factor numeric
## err.rate 300 -none- numeric
## confusion 6 -none- numeric
## votes 34356 matrix numeric
## oob.times 17178 -none- numeric
## classes 2 -none- character
## importance 80 -none- numeric
## importanceSD 60 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 17178 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
#Next we display an error plot of the random forest model:
plot(RF.model)

RF.predict.train <- predict(RF.model, newdata = train_data)
RF.train.cm <- as.matrix(table(Actual1 = train_data$y, Predicted1 = RF.predict.train))
RF.train.cm
## Predicted1
## Actual1 0 1
## 0 8622 1
## 1 0 8555
accuracy_train_rf=sum(diag(RF.train.cm))/sum(RF.train.cm)
accuracy_train_rf
## [1] 0.9999418
library(knitr)
RF.predict <- predict(RF.model, newdata = test_data)
RF.cm <- as.matrix(table(Actual = test_data$y, Predicted = RF.predict))
RF.cm
## Predicted
## Actual 0 1
## 0 7658 964
## 1 1234 7320
kable(RF.cm, caption = "Random Forest Test Confusion Matrix")
Random Forest Test Confusion Matrix
| 0 |
7658 |
964 |
| 1 |
1234 |
7320 |
accuracy_test_rf=sum(diag(RF.cm))/sum(RF.cm)
accuracy_test_rf
## [1] 0.8720307
#Below we test the accuracy on the training and test datasets and we see that it is 90.87% and 83.45%, respectively.
#The “out of sample” error is 16.51% and is in agreement with the OOB error:
#library(randomForestExplainer)
#explain_forest(RF.model, interactions = TRUE, data = train_data)
#AUC-ROC curve
#RFROC.train<-roc(train_data$y,RF.predict.train)
#RFAUC.train<-RFROC.train$auc
#RFROC.test<-roc(test_data$y,RF.predict)
#RFAUC.test<-RFROC.test$auc
#=============================================
#Casual Inference part
## Conditional Inference Tree
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:car':
##
## Predict
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
set.seed(1985)
#model training
ctree_model = ctree(y ~ .,data = train_data,controls=ctree_control(maxsurrogate=3))
ctree_model
##
## Conditional inference tree with 30 terminal nodes
##
## Response: y
## Inputs: age, job, marital, education, default, housing, loan, contact, month, day_of_week, campaign, pdays, previous, poutcome, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed, pred
## Number of observations: 17178
##
## 1) pred == {0}; criterion = 1, statistic = 7935.052
## 2) month == {apr, mar, oct}; criterion = 1, statistic = 378.34
## 3) day_of_week == {fri, mon}; criterion = 1, statistic = 79.423
## 4) month == {mar, oct}; criterion = 1, statistic = 40.916
## 5)* weights = 23
## 4) month == {apr}
## 6) marital == {single}; criterion = 0.998, statistic = 33.939
## 7) previous <= -0.08272794; criterion = 0.991, statistic = 12.384
## 8)* weights = 23
## 7) previous > -0.08272794
## 9)* weights = 82
## 6) marital == {divorced, married}
## 10) job == {admin., blue-collar, management, services, student, technician, unemployed}; criterion = 1, statistic = 44.446
## 11)* weights = 212
## 10) job == {entrepreneur, housemaid, retired, self-employed}
## 12)* weights = 35
## 3) day_of_week == {thu, tue, wed}
## 13) default == {refuse2disclose}; criterion = 1, statistic = 43.372
## 14) nr.employed <= 5119.292; criterion = 0.997, statistic = 14.372
## 15)* weights = 25
## 14) nr.employed > 5119.292
## 16)* weights = 10
## 13) default == {no}
## 17) job == {blue-collar, entrepreneur, management, services, unemployed}; criterion = 0.997, statistic = 34.856
## 18)* weights = 144
## 17) job == {admin., housemaid, retired, self-employed, student, technician}
## 19) age <= 51.2425; criterion = 0.99, statistic = 15.028
## 20)* weights = 244
## 19) age > 51.2425
## 21)* weights = 56
## 2) month == {aug, dec, jul, jun, may, nov}
## 22) contact == {telephone}; criterion = 1, statistic = 131.48
## 23) month == {aug, nov}; criterion = 1, statistic = 40.708
## 24)* weights = 141
## 23) month == {dec, jul, jun, may}
## 25)* weights = 3482
## 22) contact == {cellular}
## 26) month == {aug, dec, nov}; criterion = 1, statistic = 59.357
## 27)* weights = 2311
## 26) month == {jul, jun, may}
## 28) month == {jul}; criterion = 0.99, statistic = 17.539
## 29)* weights = 1497
## 28) month == {jun, may}
## 30) nr.employed <= 5141.647; criterion = 1, statistic = 24.29
## 31)* weights = 1382
## 30) nr.employed > 5141.647
## 32) nr.employed <= 5162.049; criterion = 0.973, statistic = 10.268
## 33)* weights = 83
## 32) nr.employed > 5162.049
## 34) housing == {no}; criterion = 0.988, statistic = 16.054
## 35)* weights = 17
## 34) housing == {refuse2disclose, yes}
## 36)* weights = 13
## 1) pred == {1}
## 37) month == {aug, jul, jun, may, nov, sep}; criterion = 1, statistic = 91.023
## 38) poutcome == {failure, nonexistent}; criterion = 1, statistic = 47.376
## 39) cons.conf.idx <= -33.11439; criterion = 1, statistic = 39.937
## 40) contact == {telephone}; criterion = 1, statistic = 51.815
## 41) euribor3m <= 1.979441; criterion = 1, statistic = 43.52
## 42)* weights = 165
## 41) euribor3m > 1.979441
## 43)* weights = 710
## 40) contact == {cellular}
## 44) month == {aug, jul, may, nov, sep}; criterion = 0.967, statistic = 19.329
## 45)* weights = 2601
## 44) month == {jun}
## 46)* weights = 621
## 39) cons.conf.idx > -33.11439
## 47) euribor3m <= 1.61896; criterion = 0.979, statistic = 12.581
## 48) cons.conf.idx <= -28.11941; criterion = 0.999, statistic = 15.761
## 49) euribor3m <= -0.313062; criterion = 0.995, statistic = 13.275
## 50)* weights = 43
## 49) euribor3m > -0.313062
## 51)* weights = 408
## 48) cons.conf.idx > -28.11941
## 52)* weights = 96
## 47) euribor3m > 1.61896
## 53)* weights = 185
## 38) poutcome == {success}
## 54) nr.employed <= 5101.956; criterion = 1, statistic = 23.066
## 55)* weights = 674
## 54) nr.employed > 5101.956
## 56)* weights = 135
## 37) month == {apr, dec, mar, oct}
## 57) campaign <= 4.274889; criterion = 0.975, statistic = 14.33
## 58)* weights = 1711
## 57) campaign > 4.274889
## 59)* weights = 49
plot(ctree_model,main="Conditional Inference Tree for Customers' Responses")

#train prediction
train_data$pred = predict(ctree_model, train_data)
#train confusion Matrix
confusionMatrix(train_data$y, factor(train_data$pred))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7635 988
## 1 1651 6904
##
## Accuracy : 0.8464
## 95% CI : (0.8409, 0.8517)
## No Information Rate : 0.5406
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6926
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8222
## Specificity : 0.8748
## Pos Pred Value : 0.8854
## Neg Pred Value : 0.8070
## Prevalence : 0.5406
## Detection Rate : 0.4445
## Detection Prevalence : 0.5020
## Balanced Accuracy : 0.8485
##
## 'Positive' Class : 0
##
#test prediction
test_data$pred = predict(ctree_model, test_data)
#test confusion Matrix
confusionMatrix(test_data$y, factor(test_data$pred))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7552 1070
## 1 1578 6976
##
## Accuracy : 0.8458
## 95% CI : (0.8403, 0.8512)
## No Information Rate : 0.5316
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6916
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8272
## Specificity : 0.8670
## Pos Pred Value : 0.8759
## Neg Pred Value : 0.8155
## Prevalence : 0.5316
## Detection Rate : 0.4397
## Detection Prevalence : 0.5020
## Balanced Accuracy : 0.8471
##
## 'Positive' Class : 0
##